# Load libraries and data ----
# Load required packages in one call.
# FIX: "patchwork" was listed twice in the original call; duplicate removed.
easypackages::libraries("here","ggplot2","caret","e1071","pheatmap","reshape2","NbClust","grid","patchwork","readxl","WGCNA","psych","nlme")
# Project-local helper functions (subtyping/splitting utilities live here)
source(here("code","ndar_functions.R"))
source(here("code","euaims_functions.R"))
source(here("code","get_ggColorHue.R"))
options(stringsAsFactors = FALSE)
# Z-score threshold to use for subtyping (|z| <= z_thresh => "equal" group)
z_thresh = 0.6
# Project paths
codepath = here("code")
datapath = here("data")
figpath = here("figures")
resultpath = here("results","ndar")
plotpath = here("plots","ndar")
# read in tidy NDAR verbal-subset data, pre-split into Discovery/Replication
Dverbal_Discovery = read.csv(file.path(datapath,"tidy_verbal_disc.csv"))
Dverbal_Replication = read.csv(file.path(datapath,"tidy_verbal_rep.csv"))
make_subtype <- function(data2use, z_thresh, mean2use=NULL, sd2use=NULL,
                         vars2use = c("dbaes_atotal","dbaes_btotal")){
  # Assign each row to a discrepancy subtype based on the z-scored
  # difference between social-communication (SC) and restricted repetitive
  # behavior (RRB) severity.
  #
  # Args:
  #   data2use: data.frame containing the columns named in vars2use.
  #   z_thresh: positive z cutoff; |z| <= z_thresh => "SC_equal_RRB".
  #   mean2use, sd2use: normalization parameters for the difference score.
  #     If NULL they are computed from data2use itself (pass the Discovery
  #     values when labeling a Replication set so norms stay anchored).
  #   vars2use: 2-element character vector; difference score is
  #     data2use[,vars2use[1]] - data2use[,vars2use[2]] (SC minus RRB).
  #     New parameter with the original hard-coded default, so existing
  #     callers are unaffected.
  #
  # Returns: data2use with added columns z_ds (z-scored difference) and
  #   z_ds_group (factor: SC_over_RRB / SC_equal_RRB / RRB_over_SC).

  # compute difference score (SC minus RRB)
  diff_score = data2use[,vars2use[1]] - data2use[,vars2use[2]]

  # compute mean and sd if necessary
  # (na.rm guards against missing item-level data; original would return NA)
  if (is.null(mean2use)){
    mean2use = mean(diff_score, na.rm = TRUE)
  } # if (is.null(mean2use))
  if (is.null(sd2use)){
    sd2use = sd(diff_score, na.rm = TRUE)
  } # if (is.null(sd2use))

  # compute z-score
  data2use$z_ds = (diff_score - mean2use)/sd2use

  # make subtype factor; which() drops NA z-scores from the logical index
  # (the original raw logical index errors on NA in subscripted assignment)
  data2use$z_ds_group = "SC_equal_RRB"
  data2use$z_ds_group[which(data2use$z_ds > z_thresh)] = "SC_over_RRB"
  data2use$z_ds_group[which(data2use$z_ds < -z_thresh)] = "RRB_over_SC"
  data2use$z_ds_group = factor(data2use$z_ds_group)

  return(data2use)
} # function make_subtype
# Columns holding the SC (a) and RRB (b) total percent-severity scores
vars2use = c("dbaes_atotal","dbaes_btotal")
# compute Discovery mean and sd of the SC-minus-RRB difference score;
# these norms are reused for the Replication set below
ds_disc = Dverbal_Discovery[,vars2use[1]] - Dverbal_Discovery[,vars2use[2]]
mean2use = mean(ds_disc)
sd2use = sd(ds_disc)
Dverbal_Discovery = make_subtype(data2use = Dverbal_Discovery,
z_thresh = z_thresh,
mean2use = mean2use,
sd2use = sd2use)
# compute Replication mean and sd
# NOTE(review): the recomputation lines below are deliberately commented
# out, so Replication is z-scored with the Discovery mean/sd (subtype
# definitions anchored to Discovery). ds_rep itself is computed but unused.
ds_rep = Dverbal_Replication[,vars2use[1]] - Dverbal_Replication[,vars2use[2]]
# mean2use = mean(ds_rep)
# sd2use = sd(ds_rep)
Dverbal_Replication = make_subtype(data2use = Dverbal_Replication,
z_thresh = z_thresh,
mean2use = mean2use,
sd2use = sd2use)
# NOTE(review): maxScores appears unused in this section — confirm before removing
maxScores = c(3,4)
# Discovery scatterplot of SC vs RRB percent severity, colored by subtype
p_disc = ggplot(data = Dverbal_Discovery, aes(x = dbaes_atotal, y = dbaes_btotal, colour = z_ds_group)) + geom_point() + xlab("SC") + ylab("RRB") + ylim(0,1) + xlim(0,1) + ggtitle("NDAR Discovery")
p1_top_left = p_disc + guides(colour=FALSE)  # legend-free copy for the composite figure
ggsave(filename = file.path(plotpath, sprintf("final_NDAR_Disc_scatterplot_z%s.pdf",as.character(z_thresh))), plot = p1_top_left)
p_disc
table(Dverbal_Discovery$z_ds_group)  # subtype counts
##
## RRB_over_SC SC_equal_RRB SC_over_RRB
## 248 405 236
# Pearson correlation between SC and RRB severity in Discovery
cor_res = cor.test(Dverbal_Discovery$dbaes_atotal, Dverbal_Discovery$dbaes_btotal)
cor_res
##
## Pearson's product-moment correlation
##
## data: Dverbal_Discovery$dbaes_atotal and Dverbal_Discovery$dbaes_btotal
## t = 6.6829, df = 887, p-value = 4.134e-11
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1554332 0.2806578
## sample estimates:
## cor
## 0.2189469
# Replication scatterplot of SC vs RRB percent severity, colored by subtype
p_rep = ggplot(data = Dverbal_Replication, aes(x = dbaes_atotal, y = dbaes_btotal, colour = z_ds_group)) + geom_point() + xlab("SC") + ylab("RRB") + ylim(0,1) + xlim(0,1) + ggtitle("NDAR Replication")
p2_bottom_left = p_rep + guides(colour=FALSE)  # legend-free copy for the composite figure
ggsave(filename = file.path(plotpath, sprintf("final_NDAR_Rep_scatterplot_z%s.pdf",as.character(z_thresh))), plot = p2_bottom_left)
p_rep
table(Dverbal_Replication$z_ds_group)  # subtype counts
##
## RRB_over_SC SC_equal_RRB SC_over_RRB
## 250 420 220
# Pearson correlation between SC and RRB severity in Replication
cor_res = cor.test(Dverbal_Replication$dbaes_atotal, Dverbal_Replication$dbaes_btotal)
cor_res
##
## Pearson's product-moment correlation
##
## data: Dverbal_Replication$dbaes_atotal and Dverbal_Replication$dbaes_btotal
## t = 7.1459, df = 888, p-value = 1.864e-12
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1700824 0.2943934
## sample estimates:
## cor
## 0.2331903
# run validation
# Make subtypes using z-scores computed from the mean and sd of the
# training (Discovery) set, then test whether an SVM trained on Discovery
# SC/RRB scores recovers the subtype labels in Replication.
train_data = Dverbal_Discovery
test_data = Dverbal_Replication
mean2use = mean(train_data[,vars2use[1]] - train_data[,vars2use[2]])
sd2use = sd(train_data[,vars2use[1]] - train_data[,vars2use[2]])
tmp_train = make_subtype(data2use = train_data,
z_thresh = z_thresh,
mean2use = mean2use,
sd2use = sd2use)
# mean2use = mean(test_data[,vars2use[1]] - test_data[,vars2use[2]])
# sd2use = sd(test_data[,vars2use[1]] - test_data[,vars2use[2]])
tmp_test = make_subtype(data2use = test_data,
z_thresh = z_thresh,
mean2use = mean2use,
sd2use = sd2use)
# compute model
mod2use = svm(x = tmp_train[,vars2use], y = tmp_train$z_ds_group)
pred_labels = predict(mod2use, tmp_test[,vars2use])
confmat = table(tmp_test$z_ds_group,pred_labels)
# accuracy = confusion-matrix diagonal over n predictions
# FIX: sum(diag()) replaces the hard-coded [1,1]+[2,2]+[3,3] indexing,
# which silently breaks if the number of subtype levels ever changes
acc = sum(diag(confmat))/length(pred_labels)
# plot confusion matrix (the viewport hook shrinks the heatmap so the
# axis titles added below fit around it)
setHook("grid.newpage", function() pushViewport(viewport(x=1,y=1,width=0.9, height=0.9, name="vp", just=c("right","top"))), action="prepend")
pheatmap(confmat/rowSums(confmat)*100, display_numbers = TRUE, color = colorRampPalette(c('white','red'))(100), cluster_rows = FALSE, cluster_cols = FALSE, fontsize_number = 15,labels_row = c("RRB>SC","SC=RRB","SC>RRB"),labels_col = c("RRB>SC","SC=RRB","SC>RRB"),angle_col=90)
setHook("grid.newpage", NULL, "replace")
grid::grid.text("Actual Labels", y=-0.07, gp=gpar(fontsize=16))
grid::grid.text("Predicted Labels", x=-0.07, rot=90, gp=gpar(fontsize=16))
# show accuracy
true_accuracy = acc
true_accuracy
## [1] 0.9786517
# Permutation test: how often does an SVM trained on shuffled subtype
# labels reach the observed cross-dataset accuracy?
nperm = 10000
# make subtypes using z-scores computed from the mean and sd of the training set
train_data = Dverbal_Discovery
test_data = Dverbal_Replication
mean2use = mean(train_data[,vars2use[1]] - train_data[,vars2use[2]])
sd2use = sd(train_data[,vars2use[1]] - train_data[,vars2use[2]])
tmp_train = make_subtype(data2use = train_data,
z_thresh = z_thresh,
mean2use = mean2use,
sd2use = sd2use)
# mean2use = mean(test_data[,vars2use[1]] - test_data[,vars2use[2]])
# sd2use = sd(test_data[,vars2use[1]] - test_data[,vars2use[2]])
tmp_test = make_subtype(data2use = test_data,
z_thresh = z_thresh,
mean2use = mean2use,
sd2use = sd2use)
acc = vector(length = nperm)
for (iperm in 1:nperm){
# set seed for reproducibility (one seed per permutation)
set.seed(iperm)
# train on permuted labels, evaluate against the true test labels
permuted_labels = sample(tmp_train$z_ds_group)
mod2use = svm(x = tmp_train[,vars2use], y = permuted_labels)
pred_labels = predict(mod2use, tmp_test[,vars2use])
confmat = table(tmp_test$z_ds_group,pred_labels)
# FIX: sum(diag()) replaces the fragile hard-coded diagonal indexing
acc[iperm] = sum(diag(confmat))/length(pred_labels)
} # for (iperm in 1:nperm)
df2plot = data.frame(Accuracy = acc)
p = ggplot(data = df2plot, aes(x = Accuracy)) + geom_histogram() + geom_vline(xintercept=true_accuracy)
p
# permutation p-value; the observed accuracy is counted as one of the
# permutations, hence the nperm+1 denominator
pval = sum(c(true_accuracy,acc)>=true_accuracy)/(nperm+1)
pval
## [1] 9.999e-05
# NOTE(review): maxScores appears unused in this section — confirm before removing
maxScores = c(3,4)
# Discovery - make plot with all individuals shown as lines
# NOTE(review): adi_total_vars2use is presumably defined in the sourced
# ndar_functions.R — not visible in this file; verify.
df2use = Dverbal_Discovery[,c("subjectkey",adi_total_vars2use)]
# tmp_train was built from Dverbal_Discovery above in the same row order,
# so the subtype labels align row-for-row
df2use$subgrp = factor(tmp_train$z_ds_group)
df2use = data.frame(df2use)
df2use$subjectkey = factor(df2use$subjectkey)
df2use$SC = df2use$dbaes_atotal
df2use$RRB = df2use$dbaes_btotal
# long format: one row per subject x subscale, for the paired-line plot
df4plot = melt(df2use,
id.vars = c("subjectkey","subgrp"),
measure.vars = c("SC","RRB"))
p = ggplot(data = df4plot, aes(x = variable,
y = value,
colour = subgrp,
group = subjectkey)) + facet_grid(. ~ subgrp)
p = p + geom_point(shape=1) + geom_line(alpha = 0.2) + ylim(0,1) + guides(color=FALSE)
p = p + ylab("Percent Severity") + xlab("ADI-R subscale")
p3_middle_top = p + guides(colour=FALSE)
ggsave(filename = file.path(plotpath, sprintf("final_NDAR_Disc_jitterplot_z%s.pdf",as.character(z_thresh))), plot = p3_middle_top)
p
# NOTE(review): maxScores appears unused in this section — confirm before removing
maxScores = c(3,4)
# Replication - make plot with all individuals shown as lines
# NOTE(review): adi_total_vars2use is presumably defined in the sourced
# ndar_functions.R — not visible in this file; verify.
df2use = Dverbal_Replication[,c("subjectkey",adi_total_vars2use)]
# tmp_test was built from Dverbal_Replication above in the same row order,
# so the subtype labels align row-for-row
df2use$subgrp = factor(tmp_test$z_ds_group)
df2use = data.frame(df2use)
df2use$subjectkey = factor(df2use$subjectkey)
df2use$SC = df2use$dbaes_atotal
df2use$RRB = df2use$dbaes_btotal
# long format: one row per subject x subscale, for the paired-line plot
df4plot = melt(df2use,
id.vars = c("subjectkey","subgrp"),
measure.vars = c("SC","RRB"))
p = ggplot(data = df4plot, aes(x = variable,
y = value,
colour = subgrp,
group = subjectkey)) + facet_grid(. ~ subgrp)
p = p + geom_point(shape=1) + geom_line(alpha = 0.2) + ylim(0,1) + guides(color=FALSE)
p = p + ylab("Percent Severity") + xlab("ADI-R subscale")
p4_middle_bottom = p + guides(colour=FALSE)
ggsave(filename = file.path(plotpath, sprintf("final_NDAR_Rep_jitterplot_z%s.pdf",as.character(z_thresh))), plot = p4_middle_bottom)
p
# make SC and RRB percentages since EU-AIMS data is specified as percentages
Dverbal = read.csv(file.path(datapath,"tidy_verbal.csv"))
euaims_data = read.csv(file.path(datapath,"tidy_euaims.csv"))
# keep only ASD individuals with complete ADI-R domain percent-severity scores
mask1 = euaims_data$Diagnosis=="ASD"
mask2 = (is.na(euaims_data$A1_pct_severity) | is.na(euaims_data$A2_pct_severity) | is.na(euaims_data$A3_pct_severity) | is.na(euaims_data$B1_pct_severity) | is.na(euaims_data$B2_pct_severity) | is.na(euaims_data$B3_pct_severity) | is.na(euaims_data$B4_pct_severity))
euaims_data = subset(euaims_data, (mask1 & !mask2))
# FIX: the original had two no-op self-assignments here
# (Dverbal[,v] = Dverbal[,v]); removed, since the NDAR columns are used as-is.
# EU-AIMS SC = mean of A1-A3 percent severities; RRB = mean of B1-B4
euaims_data[,vars2use[1]] = (euaims_data$A1_pct_severity +
euaims_data$A2_pct_severity +
euaims_data$A3_pct_severity)/3
euaims_data[,vars2use[2]] = (euaims_data$B1_pct_severity +
euaims_data$B2_pct_severity +
euaims_data$B3_pct_severity +
euaims_data$B4_pct_severity)/4
# NDAR ALL is the training set; EU-AIMS is the held-out test set
train_data = Dverbal
test_data = euaims_data
mean2use = mean(train_data[,vars2use[1]] - train_data[,vars2use[2]], na.rm=TRUE)
sd2use = sd(train_data[,vars2use[1]] - train_data[,vars2use[2]], na.rm=TRUE)
c(mean2use, sd2use)
## [1] -0.01045243 0.19482749
# Label subtypes in both samples using NDAR-ALL norms, then train an SVM
# on NDAR and project subtype labels onto the EU-AIMS sample.
tmp_train = make_subtype(data2use = train_data,
z_thresh = z_thresh,
mean2use = mean2use,
sd2use = sd2use)
tmp_test = make_subtype(data2use = test_data,
z_thresh = z_thresh,
mean2use = mean2use,
sd2use = sd2use)
# compute model
mod2use = svm(x = tmp_train[,vars2use], y = tmp_train$z_ds_group)
pred_labels = predict(mod2use, tmp_test[,vars2use])
confmat = table(tmp_test$z_ds_group,pred_labels)
# FIX: sum(diag()) replaces the fragile hard-coded [1,1]+[2,2]+[3,3] indexing
acc = sum(diag(confmat))/length(pred_labels)
tmp_test$svm_pred_labels = pred_labels
# plot confusion matrix (viewport hook shrinks the heatmap so the axis
# titles added below fit around it)
setHook("grid.newpage", function() pushViewport(viewport(x=1,y=1,width=0.9, height=0.9, name="vp", just=c("right","top"))), action="prepend")
pheatmap(confmat/rowSums(confmat)*100, display_numbers = TRUE, color = colorRampPalette(c('white','red'))(100), cluster_rows = FALSE, cluster_cols = FALSE, fontsize_number = 15,labels_row = c("RRB>SC","SC=RRB","SC>RRB"),labels_col = c("RRB>SC","SC=RRB","SC>RRB"),angle_col=90)
setHook("grid.newpage", NULL, "replace")
grid::grid.text("Actual Labels", y=-0.07, gp=gpar(fontsize=16))
grid::grid.text("Predicted Labels", x=-0.07, rot=90, gp=gpar(fontsize=16))
# scatterplot: NDAR ALL with z-score-derived groups
p1 = ggplot(data = tmp_train, aes(x = dbaes_atotal, y = dbaes_btotal, colour = factor(z_ds_group))) + geom_point() + xlab("Social-Communication") + ylab("Restricted Repetitive Behaviors") + ylim(0,0.8) + xlim(0,0.8) + ggtitle("NDAR ALL")
p1
# EU-AIMS with z-score-derived groups (NDAR norms)
p2 = ggplot(data = tmp_test, aes(x = dbaes_atotal, y = dbaes_btotal, colour = factor(z_ds_group))) + geom_point() + xlab("Social-Communication") + ylab("Restricted Repetitive Behaviors") + ylim(0,0.8) + xlim(0,0.8) + ggtitle("EU-AIMS with Groups from NDAR ALL")
p2
# EU-AIMS with SVM-predicted groups
p3 = ggplot(data = tmp_test, aes(x = dbaes_atotal, y = dbaes_btotal, colour = factor(pred_labels))) + geom_point() + xlab("SC") + ylab("RRB") + ylim(0,0.8) + xlim(0,0.8) + ggtitle("EU-AIMS")
p5_bottom_right = p3 + guides(colour=FALSE)
ggsave(filename = file.path(plotpath, sprintf("final_EUAIMS_scatterplot_z%s.pdf",as.character(z_thresh))), plot = p5_bottom_right)
p3
# # write out EU-AIMS LEAP data with subgroups defined by NDAR All
write.csv(tmp_test, file.path(datapath, sprintf("tidy_euaims_NDAR_subtypes_diffscore_z%s.csv",as.character(z_thresh))))
# Make final plot ----
# Assemble the composite figure with patchwork: scatterplots (left column),
# paired-line plots (middle), EU-AIMS scatterplot (bottom right); the
# plot_spacer() leaves the top-right cell empty.
p_final = p1_top_left + p3_middle_top + plot_spacer() + p2_bottom_left + p4_middle_bottom + p5_bottom_right + plot_layout(nrow=3, ncol=3, widths = c(4,4,4), heights = c(8,8,8))
ggsave(filename = file.path(plotpath, sprintf("final_NDAR_EUAIMS_subtypes_plot_z%s.pdf",as.character(z_thresh))), plot = p_final)
p_final
# Integrate subtypes with rest of EU-AIMS LEAP data ----
# Merge the SVM-derived subtype labels with the rest of the EU-AIMS LEAP
# phenotypic and rsfMRI-preprocessing data.
euaims_data = read.csv(file.path(datapath,"tidy_euaims.csv"))
# TD rows: keep columns 2:6 and add empty severity columns + "TD" label
# NOTE(review): positional select=2:6 is fragile if the csv layout changes
td_df = subset(euaims_data, euaims_data$Diagnosis=="TD",select=2:6)
td_df$A_pct_severity = as.numeric(NA)
td_df$B_pct_severity = as.numeric(NA)
td_df$subgrp = "TD"
# ASD rows: rename severity/prediction columns to match td_df
cols2use = c("subid","age","Centre","Schedule","Diagnosis","dbaes_atotal","dbaes_btotal","svm_pred_labels")
tmp_asd = tmp_test[,cols2use]
# NOTE(review): positional renames are fragile if cols2use changes
colnames(tmp_asd)[6] = "A_pct_severity"
colnames(tmp_asd)[7] = "B_pct_severity"
colnames(tmp_asd)[8] = "subgrp"
asd_df = tmp_asd
# FIX: a premature all_data = rbind(td_df,asd_df) here was dead code — it
# was unconditionally overwritten below after the sex merges; removed.
# TODO(review): hard-coded absolute user path; prefer here()/datapath
fname = "/Users/mlombardo/Dropbox/euaims/data/rsfmri_preproc/euaims_preproc.xlsx"
pp_data = read_excel(fname)
# keep only subjects flagged "ok" in the preprocessing notes
mask = pp_data$notes=="ok"
pp_data = subset(pp_data, mask)
# attach sex from the preprocessing sheet to both halves, then stack
asd_df = merge(pp_data[,c("subid","sex")],asd_df, by = "subid")
td_df = merge(pp_data[,c("subid","sex")],td_df, by = "subid")
all_data = rbind(td_df,asd_df)
data2write = merge(pp_data, all_data, by = "subid")
# resolve merge suffixes: .x comes from pp_data, .y from all_data
data2write$age = data2write$age.x
data2write$sex = data2write$sex.y
print(table(data2write$Schedule, data2write$subgrp))
##
## RRB_over_SC SC_equal_RRB SC_over_RRB TD
## A 7 34 39 78
## B 4 37 44 83
## C 4 28 31 59
## D 1 7 30 23
print(table(data2write$Centre, data2write$subgrp))
##
## RRB_over_SC SC_equal_RRB SC_over_RRB TD
## CAMBRIDGE 4 25 13 29
## KINGS_COLLEGE 8 42 56 78
## MANNHEIM 0 0 0 34
## NIJMEGEN 4 33 56 64
## UTRECHT 0 6 19 38
print(table(data2write$sex, data2write$subgrp))
##
## RRB_over_SC SC_equal_RRB SC_over_RRB TD
## Female 6 27 38 88
## Male 10 79 106 155
#DSM-5 - find best split that balances participants across sites
# findBestSplit comes from the sourced helper code (not visible here);
# seed_range is pinned to the previously identified best seed 172342
# (the wider search range is left commented out).
a = findBestSplit(asd_df, seed_range = c(172342)) #,300001:500000))
## [1] 172342
# keep the seed(s) with the minimum site discrepancy
best_seeds = a$seed[!is.na(a$discrepancy) & a$discrepancy==min(a$discrepancy, na.rm = TRUE)]
print(best_seeds)
## [1] 172342
# Split datasets -------------------------------------------------------------
# Split the ASD sample into Discovery/Replication halves separately within
# each assessment Schedule (A-D), stratifying by sex via SplitDatasetsBySex
# (defined in the sourced helper code; not visible in this file).
rngSeed = best_seeds[1]
# split Schedule A dataset
dset2use = subset(asd_df, asd_df$Schedule=="A")
tmp_d = SplitDatasetsBySex(dset2use, rngSeed = rngSeed)
A_Discovery = tmp_d[[2]]
A_Replication = tmp_d[[1]]
# split Schedule B dataset
dset2use = subset(asd_df, asd_df$Schedule=="B")
tmp_d = SplitDatasetsBySex(dset2use, rngSeed = rngSeed)
B_Discovery = tmp_d[[2]]
B_Replication = tmp_d[[1]]
# split Schedule C dataset
dset2use = subset(asd_df, asd_df$Schedule=="C")
tmp_d = SplitDatasetsBySex(dset2use, rngSeed = rngSeed)
C_Discovery = tmp_d[[2]]
C_Replication = tmp_d[[1]]
# split Schedule D dataset
dset2use = subset(asd_df, asd_df$Schedule=="D")
tmp_d = SplitDatasetsBySex(dset2use, rngSeed = rngSeed)
D_Discovery = tmp_d[[2]]
D_Replication = tmp_d[[1]]
# stack the per-schedule halves into full Discovery/Replication sets
df_Disc = rbind(A_Discovery, B_Discovery, C_Discovery, D_Discovery)
df_Rep = rbind(A_Replication, B_Replication, C_Replication, D_Replication)
# Check Schedule x Centre balance between the two halves
a = table(df_Disc$Schedule, df_Disc$Centre); print(a)
##
## CAMBRIDGE KINGS_COLLEGE NIJMEGEN UTRECHT
## A 6 17 10 7
## B 9 16 14 3
## C 6 9 14 2
## D 0 10 10 0
b = table(df_Rep$Schedule, df_Rep$Centre); print(b)
##
## CAMBRIDGE KINGS_COLLEGE NIJMEGEN UTRECHT
## A 7 18 10 5
## B 8 17 13 5
## C 6 10 13 3
## D 0 9 9 0
print(a-b)  # cell-wise discrepancy between halves
##
## CAMBRIDGE KINGS_COLLEGE NIJMEGEN UTRECHT
## A -1 -1 0 2
## B 1 -1 1 -2
## C 0 -1 1 -1
## D 0 1 1 0
print(sum(rowSums(abs(a-b))))  # total absolute discrepancy
## [1] 14
# Record which half each ASD subject landed in
data2write$dataset = NA
mask = is.element(data2write$subid,df_Disc$subid)
data2write[mask,"dataset"] = "Discovery"
mask = is.element(data2write$subid,df_Rep$subid)
data2write[mask,"dataset"] = "Replication"
# ASD-only views of the two halves (used for TD age-matching below)
asd_Disc = subset(data2write, data2write$dataset=="Discovery" & data2write$Diagnosis=="ASD")
asd_Rep = subset(data2write, data2write$dataset=="Replication" & data2write$Diagnosis=="ASD")
# find which seed gives best TD age-match -------------------------------------
# For each candidate seed, split the TD sample (findTDAgeMatch, from the
# sourced helper code) and t-test TD-vs-ASD age within each half; higher
# p-values indicate a better age match.
seeds = 1:1000
pvals = data.frame(matrix(nrow = length(seeds), ncol = 2))
for (i in 1:length(seeds)) {
res = findTDAgeMatch(data2write, seed_range = c(seeds[i],seeds[i]))
td_Disc_matched = res[[2]]
td_Rep_matched = res[[1]]
tres = t.test(td_Disc_matched$age, asd_Disc$age)
pvals[i,1] = tres$p.value
tres = t.test(td_Rep_matched$age, asd_Rep$age)
pvals[i,2] = tres$p.value
#print(i)
}
# rank seeds by the Discovery p-value (column 1)
a = sort.int(pvals[,1], decreasing = TRUE, index.return = TRUE)
b=pvals[a$ix,]
# NOTE(review): seed 929 was presumably picked by inspecting b for good
# p-values in both halves — confirm this choice is documented elsewhere
seed2use = 929
res = findTDAgeMatch(data2write, seed_range = c(seed2use,seed2use))
td_Disc_matched = res[[2]]
td_Rep_matched = res[[1]]
# assign the matched TD subjects to their halves
mask = is.element(data2write$subid, td_Disc_matched$subid)
data2write$dataset[mask] = "Discovery"
mask = is.element(data2write$subid, td_Rep_matched$subid)
data2write$dataset[mask] = "Replication"
print(table(data2write$dataset, data2write$subgrp))
##
## RRB_over_SC SC_equal_RRB SC_over_SC TD
## Discovery 11 52 70 121
## Replication 5 54 74 122
fname2save = here(sprintf("asd_subgrp_data_rsfmri_ALL_DSM5_diffzscoreGrps_z%s.csv",as.character(z_thresh)))
write.csv(data2write,file = fname2save)
# Plot mean FD and hypothesis test ----
# Plot mean framewise displacement (FD) by subgroup and dataset half
# (RRB_over_SC is excluded from the plot only)
df2use = subset(data2write, data2write$subgrp!="RRB_over_SC")
p = ggplot(data = df2use, aes(x = subgrp, y =meanFD, colour = subgrp)) + facet_wrap(. ~ dataset)
p = p + geom_jitter() + geom_boxplot(fill = NA, colour = "#000000", outlier.shape = NA) + guides(colour = FALSE)
p = p + xlab("Group") + ylab("Mean FD")
p
# Hypothesis test
df2use = data2write  # reset: the models below use the full sample (all 4 groups)
y_var = "meanFD"
# construct linear model
# mixed-effect model: site as random factor, all other covariates as fixed factors
fx_form = as.formula(sprintf("%s ~ %s",y_var,"subgrp"))
rx_form = as.formula(sprintf("~ 1|%s","Centre"))
mod2use = eval(substitute(lme(fixed = fx_form, random = rx_form, data = df2use, na.action = na.omit)))
# run ANOVA
res = anova(mod2use)
res
## numDF denDF F-value p-value
## (Intercept) 1 501 118.99740 <.0001
## subgrp 3 501 1.16334 0.3232
# Discovery
# BUGFIX: the filter previously tested subgrp!="RRBoverSC", but the actual
# factor level is "RRB_over_SC", so nothing was excluded (the printed
# output below shows subgrp numDF 3, i.e. all 4 groups). Rerunning with
# the corrected filter will change the printed values (subgrp numDF 2).
df4mod = subset(df2use,df2use$dataset=="Discovery" & df2use$subgrp!="RRB_over_SC")
# construct linear model
# mixed-effect model: site as random factor, all other covariates as fixed factors
fx_form = as.formula(sprintf("%s ~ %s",y_var,"subgrp"))
rx_form = as.formula(sprintf("~ 1|%s","Centre"))
mod2use = eval(substitute(lme(fixed = fx_form, random = rx_form, data = df4mod, na.action = na.omit)))
# run ANOVA
res = anova(mod2use)
res
## numDF denDF F-value p-value
## (Intercept) 1 246 145.29571 <.0001
## subgrp 3 246 2.52924 0.0579
# Replication (same BUGFIX as above)
df4mod = subset(df2use,df2use$dataset=="Replication" & df2use$subgrp!="RRB_over_SC")
# construct linear model
# mixed-effect model: site as random factor, all other covariates as fixed factors
fx_form = as.formula(sprintf("%s ~ %s",y_var,"subgrp"))
rx_form = as.formula(sprintf("~ 1|%s","Centre"))
mod2use = eval(substitute(lme(fixed = fx_form, random = rx_form, data = df4mod, na.action = na.omit)))
# run ANOVA
res = anova(mod2use)
res
## numDF denDF F-value p-value
## (Intercept) 1 247 68.77640 <.0001
## subgrp 3 247 0.23437 0.8724
# Plot ADOS scores in the subtypes and also run a hypothesis test ----
# Plot ADOS CSS scores by subtype (ASD only) and test subgroup differences
df2use = subset(data2write, data2write$subgrp!="TD")
# recode the 999 missing-data placeholder to NA for the two ADOS columns
df2use$ados_2_SA_CSS[df2use$ados_2_SA_CSS==999] = NA
df2use$ados_2_RRB_CSS[df2use$ados_2_RRB_CSS==999] = NA
# long format: one row per subject x ADOS subscale
df4plot = melt(df2use,
id.vars = c("subid","subgrp"),
measure.vars = c("ados_2_SA_CSS","ados_2_RRB_CSS"))
df4plot$variable = as.character(df4plot$variable)
df4plot$variable[df4plot$variable=="ados_2_SA_CSS"] = "SA"
df4plot$variable[df4plot$variable=="ados_2_RRB_CSS"] = "RRB"
df4plot$variable = factor(df4plot$variable)
p = ggplot(data = df4plot, aes(x = variable,
y = value,
colour = subgrp,
group = subid)) + facet_grid(. ~ subgrp)
p = p + geom_point(shape=1) + geom_line(alpha = 0.2) + guides(color=FALSE)
p = p + ylab("ADOS CSS") + xlab("ADOS Subscale")
# NOTE(review): no plot argument, so ggsave writes the last plot created
ggsave(filename = file.path(plotpath, sprintf("final_EUAIMS_ADOS_jitterplot_z%s.pdf",as.character(z_thresh))))
p
# hypothesis test on ADOS SA CSS
y_var = "ados_2_SA_CSS"
# NOTE(review): df2use is rebuilt from data2write here, so the 999->NA
# recode above does NOT apply to these models — confirm the ADOS columns
# in data2write are already NA-coded, otherwise 999s enter the lme.
df2use = subset(data2write, !is.element(data2write$subgrp,c("TD")))
# Hypothesis test
# construct linear model
# mixed-effect model: site as random factor, all other covariates as fixed factors
fx_form = as.formula(sprintf("%s ~ %s",y_var,"subgrp"))
rx_form = as.formula(sprintf("~ 1|%s","Centre"))
mod2use = eval(substitute(lme(fixed = fx_form, random = rx_form, data = df2use, na.action = na.omit)))
# run ANOVA
res = anova(mod2use)
res
## numDF denDF F-value p-value
## (Intercept) 1 260 9.643933 0.0021
## subgrp 2 260 0.801596 0.4497
# hypothesis test on ADOS RRB CSS (same caveat about 999 codes as above)
y_var = "ados_2_RRB_CSS"
df2use = subset(data2write, !is.element(data2write$subgrp,c("TD")))
# Hypothesis test
# construct linear model
# mixed-effect model: site as random factor, all other covariates as fixed factors
fx_form = as.formula(sprintf("%s ~ %s",y_var,"subgrp"))
rx_form = as.formula(sprintf("~ 1|%s","Centre"))
mod2use = eval(substitute(lme(fixed = fx_form, random = rx_form, data = df2use, na.action = na.omit)))
# run ANOVA
res = anova(mod2use)
res
## numDF denDF F-value p-value
## (Intercept) 1 260 9.855800 0.0019
## subgrp 2 260 0.780035 0.4595
# Descriptive stats ----
# Build a slim table for descriptive statistics
df2use = data2write[,c("subid","Diagnosis","dataset","Centre","meanFD","age","sex","subgrp","viq_all","piq_all","fsiq4_all","ados_2_SA_CSS","ados_2_RRB_CSS","SRS_tscore","SRS_tscore_self")]
# recode the 999 missing-data placeholder to NA across the whole frame
mask = df2use==999
df2use[mask] = NA
# NOTE(review): divides age by 365 — presumably days-to-years; confirm the
# stored unit (and whether 365.25 is intended)
df2use$age = df2use$age/365
cols2use = c("dataset","subgrp","age","meanFD","viq_all","piq_all","fsiq4_all","ados_2_SA_CSS","ados_2_RRB_CSS","SRS_tscore","SRS_tscore_self")
# psych::describeBy, grouped by subtype
res=describeBy(x = df2use[,cols2use], group = c("subgrp"))
res
##
## Descriptive statistics by group
## group: RRB_over_SC
## vars n mean sd median trimmed mad min max range
## dataset* 1 16 NaN NA NA NaN NA Inf -Inf -Inf
## subgrp* 2 16 NaN NA NA NaN NA Inf -Inf -Inf
## age 3 16 16.81 5.64 16.52 16.93 8.19 7.89 24.03 16.14
## meanFD 4 16 0.24 0.25 0.18 0.18 0.07 0.06 1.14 1.09
## viq_all 5 16 100.54 17.69 97.50 99.47 15.57 73.00 143.00 70.00
## piq_all 6 16 102.05 18.21 100.50 101.49 14.08 64.00 148.00 84.00
## fsiq4_all 7 16 101.55 17.18 100.36 100.20 12.60 74.00 148.00 74.00
## ados_2_SA_CSS 8 16 4.44 2.83 3.50 4.36 3.71 1.00 9.00 8.00
## ados_2_RRB_CSS 9 16 3.69 3.32 1.00 3.50 0.00 1.00 9.00 8.00
## SRS_tscore 10 11 70.45 12.10 72.00 70.78 8.90 48.00 90.00 42.00
## SRS_tscore_self 11 6 61.17 6.52 63.50 61.17 3.71 49.00 67.00 18.00
## skew kurtosis se
## dataset* NA NA NA
## subgrp* NA NA NA
## age -0.10 -1.63 1.41
## meanFD 2.77 7.09 0.06
## viq_all 0.57 -0.07 4.42
## piq_all 0.45 0.89 4.55
## fsiq4_all 0.90 1.04 4.29
## ados_2_SA_CSS 0.21 -1.64 0.71
## ados_2_RRB_CSS 0.48 -1.64 0.83
## SRS_tscore -0.16 -0.90 3.65
## SRS_tscore_self -0.92 -0.85 2.66
## ------------------------------------------------------------
## group: SC_equal_RRB
## vars n mean sd median trimmed mad min max range
## dataset* 1 106 NaN NA NA NaN NA Inf -Inf -Inf
## subgrp* 2 106 NaN NA NA NaN NA Inf -Inf -Inf
## age 3 106 16.40 6.07 15.42 16.02 6.04 7.08 30.28 23.20
## meanFD 4 106 0.26 0.39 0.19 0.21 0.12 0.04 3.95 3.91
## viq_all 5 104 100.32 16.86 102.33 100.76 15.83 64.00 136.00 72.00
## piq_all 6 104 102.96 18.38 105.30 103.61 18.83 61.00 142.00 81.00
## fsiq4_all 7 105 101.91 16.78 105.00 102.53 17.79 60.00 131.00 71.00
## ados_2_SA_CSS 8 104 5.84 2.58 6.00 5.88 2.97 1.00 10.00 9.00
## ados_2_RRB_CSS 9 104 5.02 2.63 5.00 5.02 2.97 1.00 10.00 9.00
## SRS_tscore 10 97 68.86 11.95 69.00 68.84 14.83 43.00 90.00 47.00
## SRS_tscore_self 11 54 61.41 9.80 61.50 61.25 11.12 43.00 89.00 46.00
## skew kurtosis se
## dataset* NA NA NA
## subgrp* NA NA NA
## age 0.48 -0.68 0.59
## meanFD 7.80 69.70 0.04
## viq_all -0.22 -0.65 1.65
## piq_all -0.31 -0.58 1.80
## fsiq4_all -0.33 -0.69 1.64
## ados_2_SA_CSS -0.16 -1.05 0.25
## ados_2_RRB_CSS -0.36 -0.87 0.26
## SRS_tscore -0.02 -1.01 1.21
## SRS_tscore_self 0.18 -0.29 1.33
## ------------------------------------------------------------
## group: SC_over_RRB
## vars n mean sd median trimmed mad min max range
## dataset* 1 144 NaN NA NA NaN NA Inf -Inf -Inf
## subgrp* 2 144 NaN NA NA NaN NA Inf -Inf -Inf
## age 3 144 16.32 5.03 15.99 16.17 5.61 7.48 29.4 21.92
## meanFD 4 144 0.25 0.27 0.16 0.19 0.11 0.03 1.6 1.56
## viq_all 5 140 96.94 19.58 99.00 97.40 20.74 50.91 142.0 91.09
## piq_all 6 142 98.49 22.24 102.49 99.66 21.31 44.03 150.0 105.97
## fsiq4_all 7 142 98.06 19.78 102.25 98.76 20.20 59.00 143.0 84.00
## ados_2_SA_CSS 8 138 6.31 2.64 7.00 6.44 2.97 1.00 10.0 9.00
## ados_2_RRB_CSS 9 138 4.70 2.75 5.00 4.62 2.97 1.00 10.0 9.00
## SRS_tscore 10 124 73.48 11.71 75.50 74.13 12.60 44.00 95.0 51.00
## SRS_tscore_self 11 63 63.10 10.88 61.00 62.27 8.90 40.00 94.0 54.00
## skew kurtosis se
## dataset* NA NA NA
## subgrp* NA NA NA
## age 0.30 -0.54 0.42
## meanFD 2.95 9.71 0.02
## viq_all -0.19 -0.54 1.65
## piq_all -0.41 -0.51 1.87
## fsiq4_all -0.31 -0.82 1.66
## ados_2_SA_CSS -0.36 -0.92 0.22
## ados_2_RRB_CSS -0.20 -1.14 0.23
## SRS_tscore -0.38 -0.54 1.05
## SRS_tscore_self 0.68 0.55 1.37
## ------------------------------------------------------------
## group: TD
## vars n mean sd median trimmed mad min max range
## dataset* 1 243 NaN NA NA NaN NA Inf -Inf -Inf
## subgrp* 2 243 NaN NA NA NaN NA Inf -Inf -Inf
## age 3 243 16.84 5.66 16.58 16.63 6.57 6.89 29.84 22.95
## meanFD 4 243 0.20 0.34 0.13 0.15 0.07 0.03 4.60 4.58
## viq_all 5 241 104.27 18.62 106.00 105.33 14.83 45.00 160.00 115.00
## piq_all 6 241 105.36 18.92 107.00 107.07 16.31 49.00 147.00 98.00
## fsiq4_all 7 241 105.33 17.70 108.18 107.16 13.61 50.00 142.00 92.00
## ados_2_SA_CSS 8 0 NaN NA NA NaN NA Inf -Inf -Inf
## ados_2_RRB_CSS 9 0 NaN NA NA NaN NA Inf -Inf -Inf
## SRS_tscore 10 133 47.54 9.34 44.00 45.82 4.45 37.00 90.00 53.00
## SRS_tscore_self 11 132 47.50 5.90 46.00 46.72 5.19 39.00 69.00 30.00
## skew kurtosis se
## dataset* NA NA NA
## subgrp* NA NA NA
## age 0.28 -0.74 0.36
## meanFD 9.47 111.25 0.02
## viq_all -0.55 0.92 1.20
## piq_all -0.80 0.53 1.22
## fsiq4_all -0.96 1.13 1.14
## ados_2_SA_CSS NA NA NA
## ados_2_RRB_CSS NA NA NA
## SRS_tscore 1.85 3.60 0.81
## SRS_tscore_self 1.18 1.39 0.51
# Model age differences ----
# Model age differences among the four groups (three ASD subtypes + TD),
# with Centre as a random intercept; then repeat within each half
# excluding the small RRB_over_SC subgroup.
y_var = "age"
# df4mod = subset(df2use,df2use$subgrp!="RRB_over_SC")
df4mod = df2use
# construct linear model
# mixed-effect model: site as random factor, all other covariates as fixed factors
fx_form = as.formula(sprintf("%s ~ %s",y_var,"subgrp"))
rx_form = as.formula(sprintf("~ 1|%s","Centre"))
mod2use = eval(substitute(lme(fixed = fx_form, random = rx_form, data = df4mod, na.action = na.omit)))
# run ANOVA
res = anova(mod2use)
res
## numDF denDF F-value p-value
## (Intercept) 1 501 792.5869 <.0001
## subgrp 3 501 0.5403 0.6549
# Discovery
# BUGFIX: the filter previously tested subgrp!="RRBoverSC", but the actual
# factor level is "RRB_over_SC", so nothing was excluded (the printed
# output below shows subgrp numDF 3, i.e. all 4 groups). Rerunning with
# the corrected filter will change the printed values (subgrp numDF 2).
df4mod = subset(df2use,df2use$dataset=="Discovery" & df2use$subgrp!="RRB_over_SC")
# construct linear model
# mixed-effect model: site as random factor, all other covariates as fixed factors
fx_form = as.formula(sprintf("%s ~ %s",y_var,"subgrp"))
rx_form = as.formula(sprintf("~ 1|%s","Centre"))
mod2use = eval(substitute(lme(fixed = fx_form, random = rx_form, data = df4mod, na.action = na.omit)))
# run ANOVA
res = anova(mod2use)
res
## numDF denDF F-value p-value
## (Intercept) 1 246 602.0249 <.0001
## subgrp 3 246 0.3993 0.7536
# Replication (same BUGFIX as above)
df4mod = subset(df2use,df2use$dataset=="Replication" & df2use$subgrp!="RRB_over_SC")
# construct linear model
# mixed-effect model: site as random factor, all other covariates as fixed factors
fx_form = as.formula(sprintf("%s ~ %s",y_var,"subgrp"))
rx_form = as.formula(sprintf("~ 1|%s","Centre"))
mod2use = eval(substitute(lme(fixed = fx_form, random = rx_form, data = df4mod, na.action = na.omit)))
# run ANOVA
res = anova(mod2use)
res
## numDF denDF F-value p-value
## (Intercept) 1 247 505.9907 <.0001
## subgrp 3 247 0.5217 0.6677
# Model VIQ differences ----
# Model verbal IQ differences among the four groups, Centre as random
# intercept; then per half excluding the small RRB_over_SC subgroup.
y_var = "viq_all"
# df4mod = subset(df2use,df2use$subgrp!="RRB_over_SC")
df4mod = df2use
# construct linear model
# mixed-effect model: site as random factor, all other covariates as fixed factors
fx_form = as.formula(sprintf("%s ~ %s",y_var,"subgrp"))
rx_form = as.formula(sprintf("~ 1|%s","Centre"))
mod2use = eval(substitute(lme(fixed = fx_form, random = rx_form, data = df4mod, na.action = na.omit)))
# run ANOVA
res = anova(mod2use)
res
## numDF denDF F-value p-value
## (Intercept) 1 493 1961.9135 <.0001
## subgrp 3 493 3.6635 0.0124
# Discovery
# BUGFIX: filter previously tested subgrp!="RRBoverSC" (wrong level name,
# actual level is "RRB_over_SC"), so it excluded nothing — confirmed by
# subgrp numDF 3 in the output below; rerunning will give numDF 2.
df4mod = subset(df2use,df2use$dataset=="Discovery" & df2use$subgrp!="RRB_over_SC")
# construct linear model
# mixed-effect model: site as random factor, all other covariates as fixed factors
fx_form = as.formula(sprintf("%s ~ %s",y_var,"subgrp"))
rx_form = as.formula(sprintf("~ 1|%s","Centre"))
mod2use = eval(substitute(lme(fixed = fx_form, random = rx_form, data = df4mod, na.action = na.omit)))
# run ANOVA
res = anova(mod2use)
res
## numDF denDF F-value p-value
## (Intercept) 1 242 2229.0030 <.0001
## subgrp 3 242 2.0844 0.1029
# Replication (same BUGFIX as above)
df4mod = subset(df2use,df2use$dataset=="Replication" & df2use$subgrp!="RRB_over_SC")
# construct linear model
# mixed-effect model: site as random factor, all other covariates as fixed factors
fx_form = as.formula(sprintf("%s ~ %s",y_var,"subgrp"))
rx_form = as.formula(sprintf("~ 1|%s","Centre"))
mod2use = eval(substitute(lme(fixed = fx_form, random = rx_form, data = df4mod, na.action = na.omit)))
# run ANOVA
res = anova(mod2use)
res
## numDF denDF F-value p-value
## (Intercept) 1 243 1477.9386 <.0001
## subgrp 3 243 2.1664 0.0926
# Model PIQ differences ----
# Model performance IQ differences among the four groups, Centre as random
# intercept; then per half excluding the small RRB_over_SC subgroup.
y_var = "piq_all"
# df4mod = subset(df2use,df2use$subgrp!="RRB_over_SC")
df4mod = df2use
# construct linear model
# mixed-effect model: site as random factor, all other covariates as fixed factors
fx_form = as.formula(sprintf("%s ~ %s",y_var,"subgrp"))
rx_form = as.formula(sprintf("~ 1|%s","Centre"))
mod2use = eval(substitute(lme(fixed = fx_form, random = rx_form, data = df4mod, na.action = na.omit)))
# run ANOVA
res = anova(mod2use)
res
## numDF denDF F-value p-value
## (Intercept) 1 495 1478.8200 <.0001
## subgrp 3 495 1.9176 0.1258
# Discovery
# BUGFIX: filter previously tested subgrp!="RRBoverSC" (wrong level name,
# actual level is "RRB_over_SC"), so it excluded nothing — confirmed by
# subgrp numDF 3 in the output below; rerunning will give numDF 2.
df4mod = subset(df2use,df2use$dataset=="Discovery" & df2use$subgrp!="RRB_over_SC")
# construct linear model
# mixed-effect model: site as random factor, all other covariates as fixed factors
fx_form = as.formula(sprintf("%s ~ %s",y_var,"subgrp"))
rx_form = as.formula(sprintf("~ 1|%s","Centre"))
mod2use = eval(substitute(lme(fixed = fx_form, random = rx_form, data = df4mod, na.action = na.omit)))
# run ANOVA
res = anova(mod2use)
res
## numDF denDF F-value p-value
## (Intercept) 1 242 1885.666 <.0001
## subgrp 3 242 1.755 0.1564
# Replication (same BUGFIX as above)
df4mod = subset(df2use,df2use$dataset=="Replication" & df2use$subgrp!="RRB_over_SC")
# construct linear model
# mixed-effect model: site as random factor, all other covariates as fixed factors
fx_form = as.formula(sprintf("%s ~ %s",y_var,"subgrp"))
rx_form = as.formula(sprintf("~ 1|%s","Centre"))
mod2use = eval(substitute(lme(fixed = fx_form, random = rx_form, data = df4mod, na.action = na.omit)))
# run ANOVA
res = anova(mod2use)
res
## numDF denDF F-value p-value
## (Intercept) 1 245 1169.8593 <.0001
## subgrp 3 245 1.0994 0.35
# Model FIQ differences ----
# Model full-scale IQ differences among the four groups, Centre as random
# intercept; then per half excluding the small RRB_over_SC subgroup.
y_var = "fsiq4_all"
# df4mod = subset(df2use,df2use$subgrp!="RRB_over_SC")
df4mod = df2use
# construct linear model
# mixed-effect model: site as random factor, all other covariates as fixed factors
fx_form = as.formula(sprintf("%s ~ %s",y_var,"subgrp"))
rx_form = as.formula(sprintf("~ 1|%s","Centre"))
mod2use = eval(substitute(lme(fixed = fx_form, random = rx_form, data = df4mod, na.action = na.omit)))
# run ANOVA
res = anova(mod2use)
res
## numDF denDF F-value p-value
## (Intercept) 1 496 2055.9457 <.0001
## subgrp 3 496 3.0199 0.0295
# Discovery
# BUGFIX: filter previously tested subgrp!="RRBoverSC" (wrong level name,
# actual level is "RRB_over_SC"), so it excluded nothing — confirmed by
# subgrp numDF 3 in the output below; rerunning will give numDF 2.
df4mod = subset(df2use,df2use$dataset=="Discovery" & df2use$subgrp!="RRB_over_SC")
# construct linear model
# mixed-effect model: site as random factor, all other covariates as fixed factors
fx_form = as.formula(sprintf("%s ~ %s",y_var,"subgrp"))
rx_form = as.formula(sprintf("~ 1|%s","Centre"))
mod2use = eval(substitute(lme(fixed = fx_form, random = rx_form, data = df4mod, na.action = na.omit)))
# run ANOVA
res = anova(mod2use)
res
## numDF denDF F-value p-value
## (Intercept) 1 244 2758.4506 <.0001
## subgrp 3 244 2.3434 0.0737
# Replication (same BUGFIX as above)
df4mod = subset(df2use,df2use$dataset=="Replication" & df2use$subgrp!="RRB_over_SC")
# construct linear model
# mixed-effect model: site as random factor, all other covariates as fixed factors
fx_form = as.formula(sprintf("%s ~ %s",y_var,"subgrp"))
rx_form = as.formula(sprintf("~ 1|%s","Centre"))
mod2use = eval(substitute(lme(fixed = fx_form, random = rx_form, data = df4mod, na.action = na.omit)))
# run ANOVA
res = anova(mod2use)
res
## numDF denDF F-value p-value
## (Intercept) 1 244 1450.1789 <.0001
## subgrp 3 244 1.5459 0.2032